import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,precision_recall_curve,roc_curve,auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import label_binarize
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()
# Local imports
from ipynb.fs.defs.task3_1 import DatasetManager
# Plotting
def plot_data(*traces,x=None,y=None,z=None,title="",x_label="",y_label="",name="",mode="markers",text=""):
    """
    Build a Plotly figure from an optional primary (x, y[, z]) series plus
    any number of pre-built traces.

    The primary series is rendered as a 2-D Scatter, or a Scatter3d when a
    z sequence is supplied; it is only added when both x and y are given.
    All extra traces passed positionally are appended afterwards.
    """
    fig = go.Figure(layout={
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label}
    })
    # Shared keyword arguments for either trace type.
    common = dict(x=x, y=y, mode=mode, name=name, text=text)
    if z is None:
        primary = go.Scatter(**common)
    else:
        primary = go.Scatter3d(z=z, **common)
    # Skip the primary series when no coordinates were supplied.
    if x is not None and y is not None:
        fig.add_trace(primary)
    for extra in traces:
        fig.add_trace(extra)
    return fig
def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    """
    Build a grouped bar chart from (label, values) pairs.

    Each positional argument is a 2-item sequence: the series name and the
    y-values; bar labels are formatted to 4 decimal places.
    """
    layout = {
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label},
        "barmode": "group"
    }
    traces = []
    for label, values in bars:
        traces.append(
            go.Bar(name=f"{label}", x=x, y=values, text=values, texttemplate="%{y:.4f}")
        )
    return go.Figure(layout=layout, data=traces)
def create_trace(x=None, y=None, z=None, name="", mode="lines", text="", marker_size=None):
    """
    Return a standalone Plotly trace: Scatter (2-D) by default, or
    Scatter3d when a z sequence is supplied.
    """
    # Keyword arguments common to both trace types.
    common = dict(
        x=x,
        y=y,
        mode=mode,
        name=name,
        text=text,
        marker=dict(size=marker_size)
    )
    if z is not None:
        return go.Scatter3d(z=z, **common)
    return go.Scatter(**common)
def plot_collection(plots, rows, cols, subplot_titles, specs, title="", height=1000, width=1000):
    """
    Combine standalone figures into a single subplot grid.

    `plots` maps a key tuple (row, col, x_axis_title, y_axis_title) to a
    go.Figure; every trace of each figure is copied into the corresponding
    grid cell, then the cell's axis titles are set from the key.
    """
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=subplot_titles,
        specs=specs
    )
    fig.update_layout({
        "title": title,
        "height": height,
        "width": width,
    })
    # Copy every trace of every sub-figure into its grid cell.
    for (row, col, _, _), sub_fig in plots.items():
        for trace in sub_fig.data:
            fig.add_trace(trace, row=row, col=col)
    # Label each cell's axes from its key tuple.
    for (row, col, x_title, y_title) in plots:
        fig.update_xaxes(title_text=x_title, row=row, col=col)
        fig.update_yaxes(title_text=y_title, row=row, col=col)
    return fig
class ModelManager:
    """
    Manage the model lifecycle for one dataset: train/test splitting, SVM
    training with cross-validated hyperparameter search (GridSearchCV), and
    Plotly-based visualisation of classification or regression results.
    """
    def __init__(self, feature_set, targets):
        # Input data
        self._features = feature_set
        self._targets = targets
        # Populated by split_dataset()
        self._train_and_test_sets = None
        # Populated by train_model()
        self._trained_model = None
        self._best_hps = None
        self._train_preds = None
        self._train_preds_prob = None
        self._test_preds = None
        self._test_preds_prob = None
    def get_features(self):
        # BUG FIX: previously returned self._feature_set, an attribute that
        # is never assigned (the constructor stores self._features), so this
        # method always raised AttributeError.
        return self._features
    def get_targets(self):
        return self._targets
    def get_train_and_test_sets(self):
        return self._train_and_test_sets
    def get_trained_model(self):
        return self._trained_model
    def get_optimal_hyperparameters(self):
        return self._best_hps
    def split_dataset(self, train_size=0.8, test_size=0.2):
        """
        Split the dataset into a training and a test set and store the
        result in self._train_and_test_sets.

        train_size and test_size are fractions of the full dataset; they
        need not sum to 1, in which case the remainder is simply unused.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self._features,
            self._targets,
            train_size=train_size,
            test_size=test_size,
        )
        self._train_and_test_sets = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }
    def train_model(self, model_type, cv_folds=10):
        """
        Train either a classification ("clf") or regression ("reg") SVM and
        optimise hyperparameters (C, gamma) via cross-validated grid search.

        Raises:
            ValueError: if model_type is neither "clf" nor "reg".
        """
        assert self._train_and_test_sets is not None, "You don't have your training and test sets."
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        X_test = self._train_and_test_sets.get("X_test")
        # Initialise model
        if model_type == "clf":
            # probability=True is required for predict_proba() below.
            estimator = SVC(probability=True)
            print("Classifier model initialised...")
        elif model_type == "reg":
            estimator = SVR()
            print("Regression model initialised...")
        else:
            # Previously an unrecognised model_type fell through and raised
            # a confusing NameError on `estimator`; fail fast instead.
            raise ValueError(f"Unknown model_type {model_type!r}; expected 'clf' or 'reg'.")
        # Specify parameter ranges to be searched
        parameter_grid = [{
            "C": [1, 10, 100],
            "gamma": [0.01, 0.1, 1],
        }]
        # Model fitting, cross-validation and hyperparameter optimisation using GridSearch
        model = GridSearchCV(
            estimator=estimator,
            param_grid=parameter_grid,
            cv=cv_folds,
            refit=True
        )
        print("Fitting model and performing cross-validation...")
        model.fit(X_train, y_train)
        print("Model fitting and cross-validation complete...")
        # Getting predictions
        print("Making predictions...")
        self._train_preds = model.predict(X_train)
        self._test_preds = model.predict(X_test)
        if model_type == "clf":
            # Class-membership probabilities, needed for P-R and ROC curves.
            self._train_preds_prob = model.predict_proba(X_train)
            self._test_preds_prob = model.predict_proba(X_test)
        # Assigning class variables
        self._trained_model = model
        self._best_hps = model.best_params_
    def _get_key_metric_plot_clf(self, y_train, y_test, train_preds, test_preds):
        """
        Return a bar plot of key classification scores (accuracy, precision,
        recall, F1 — all weighted-averaged) for the training and test sets.
        """
        # Calculating accuracies and scores
        train_acc = accuracy_score(y_train, train_preds)
        train_prec = precision_score(y_train, train_preds, average="weighted")
        train_rec = recall_score(y_train, train_preds, average="weighted")
        train_f1 = f1_score(y_train, train_preds, average="weighted")
        test_acc = accuracy_score(y_test, test_preds)
        test_prec = precision_score(y_test, test_preds, average="weighted")
        test_rec = recall_score(y_test, test_preds, average="weighted")
        test_f1 = f1_score(y_test, test_preds, average="weighted")
        # Create key metric bar plot
        x = ["Training Set", "Test Set"]
        key_metric_plot = plot_bar_data(
            ("Accuracy", [train_acc, test_acc]),
            ("Precision", [train_prec, test_prec]),
            ("Recall", [train_rec, test_rec]),
            ("F1 Score", [train_f1, test_f1]),
            x=x,
            title="Key Metrics",
            x_label="Dataset"
        )
        return key_metric_plot
    def _get_key_metric_plot_reg(self, y_train, y_test, train_preds, test_preds):
        """
        Return a bar plot of key regression metrics (MSE, MAE, R2) for the
        training and test sets.
        """
        # Calculating errors and scores
        train_mse = mean_squared_error(y_train, train_preds)
        train_mae = mean_absolute_error(y_train, train_preds)
        train_r2 = r2_score(y_train, train_preds)
        test_mse = mean_squared_error(y_test, test_preds)
        test_mae = mean_absolute_error(y_test, test_preds)
        test_r2 = r2_score(y_test, test_preds)
        # Create key metric bar plot
        x = ["Training Set", "Test Set"]
        key_metric_plot = plot_bar_data(
            ("Mean Squared Error", [train_mse, test_mse]),
            ("Mean Absolute Error", [train_mae, test_mae]),
            ("R2 Score", [train_r2, test_r2]),
            x=x,
            title="Key Metrics",
            x_label="Dataset"
        )
        return key_metric_plot
    def _get_true_pred_plot(self, y_test, test_preds):
        """
        Return a scatter plot of predicted values against true values,
        indexed by test-sample position.
        """
        x = np.arange(y_test.shape[0])
        y_test_trace = create_trace(x=x, y=y_test, name="True Values", mode="markers")
        y_pred_trace = create_trace(x=x, y=test_preds, name="Predicted Values", mode="markers")
        true_pred_plot = plot_data(y_test_trace, y_pred_trace, title="True vs Predicted Values")
        return true_pred_plot
    def _get_precision_recall_plot(self, y_test, test_preds_prob, encodings, classes=(0, 1, 2)):
        """
        Return a figure with one-vs-rest precision-recall curves (and their
        AUCs) for every class in a classification task.
        """
        # Tuple default replaces the original mutable-list default argument.
        n_classes = len(classes)
        # Binarise true test values for one-vs-rest curves.
        y_test_bin = label_binarize(y_test, classes=classes)
        # Creating traces
        prec = dict()
        rec = dict()
        traces = []
        for i in range(n_classes):
            prec[i], rec[i], _ = precision_recall_curve(y_test_bin[:, i], test_preds_prob[:, i])
            curve_area = auc(rec[i], prec[i])
            new_trace = create_trace(
                x=rec[i],
                y=prec[i],
                name=f"P-R for Class {i} ({encodings[i]}); AUC = {round(curve_area, 3)}"
            )
            traces.append(new_trace)
        # Creating and returning plot
        return plot_data(
            *traces,
            x=np.arange(0,1, 0.01),
            title="Precision-Recall Curves",
            x_label="Recall",
            y_label="Precision",
        )
    def _get_roc_plot(self, y_test, test_preds_prob, encodings, classes=(0, 1, 2)):
        """
        Return a figure with one-vs-rest ROC curves (and their AUCs) for
        every class in a classification task.
        """
        # Tuple default replaces the original mutable-list default argument.
        n_classes = len(classes)
        # Binarise true test set values for one-vs-rest curves.
        y_test_bin = label_binarize(y_test, classes=classes)
        # Creating traces
        fpr = dict()
        tpr = dict()
        traces = []
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_preds_prob[:, i])
            curve_area = auc(fpr[i], tpr[i])
            new_trace = create_trace(
                x=fpr[i],
                y=tpr[i],
                name=f"ROC for Class {i} ({encodings[i]}); AUC = {round(curve_area, 3)}"
            )
            traces.append(new_trace)
        # Creating and returning plot
        return plot_data(
            *traces,
            x=np.arange(0,1, 0.01),
            title="ROC Curves",
            x_label="False-Positive Rate",
            y_label="True-Positive Rate",
        )
    def visualise_results_clf(self, encodings):
        """
        Return a combined subplot figure (key metrics, true vs predicted,
        P-R curves, ROC curves) for a trained classification model.
        """
        assert self._trained_model is not None, "You haven't trained a model yet."
        # Getting training, test and predictions data
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds
        test_preds_prob = self._test_preds_prob
        # Get key metric plot
        key_metric_plot = self._get_key_metric_plot_clf(y_train, y_test, train_preds, test_preds)
        # True values vs predictions
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)
        # Precision-recall curves
        prec_rec_plot = self._get_precision_recall_plot(y_test, test_preds_prob, encodings)
        # ROC curves
        roc_plot = self._get_roc_plot(y_test, test_preds_prob, encodings)
        # Combining plots; keys are (row, col, x_axis_title, y_axis_title)
        plots = {
            (1,1,"Dataset",""): key_metric_plot,
            (2,1,"","Class"): true_pred_plot,
            (3,1,"Recall","Precision"): prec_rec_plot,
            (3,2,"False-Positive Rate","True-Positive Rate"): roc_plot
        }
        subplot_titles = [
            "Key Metrics",
            "True vs Predicted Values",
            "Precision-Recall Curves",
            "ROC Curves",
        ]
        specs = [
            [{"type": "bar", "colspan": 2}, None],
            [{"type": "xy", "colspan": 2}, None],
            [{"type": "xy"}, {"type": "xy"}],
        ]
        combined_plot = plot_collection(
            plots,
            rows=3,
            cols=2,
            subplot_titles=subplot_titles,
            specs=specs,
            title="Model Performance Results",
        )
        return combined_plot
    def visualise_results_reg(self):
        """
        Return a combined subplot figure (key metrics, true vs predicted)
        for a trained regression model.
        """
        assert self._trained_model is not None, "You haven't trained a model yet."
        # Getting training, test and predictions data
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds
        # Get key metric plot
        key_metric_plot = self._get_key_metric_plot_reg(y_train, y_test, train_preds, test_preds)
        # True values vs predictions
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)
        # Combining plots; keys are (row, col, x_axis_title, y_axis_title)
        plots = {
            (1,1,"Dataset",""): key_metric_plot,
            (2,1,"","Productivity"): true_pred_plot,
        }
        subplot_titles = ["Key Metrics","True vs Predicted Values"]
        specs = [
            [{"type": "bar"}],
            [{"type": "xy"}],
        ]
        combined_plot = plot_collection(
            plots,
            rows=2,
            cols=1,
            subplot_titles=subplot_titles,
            specs=specs,
            title="Model Performance Results",
            height=900
        )
        return combined_plot
# Productivity dataset; using optimal configuration as determined in Task3-1
gwp_dsm = DatasetManager("gwp_assessment")
# NOTE(review): argument semantics are defined by task3_1's DatasetManager —
# presumably column indices plus the imputation strategy; confirm there.
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative")
gwp_dsm.create_feature_set(7)
gwp_dsm.scale_feature_set()
# Star dataset; using optimal configuration as determined in Task3-1
star_dsm = DatasetManager("star_assessment")
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn")
star_dsm.create_feature_set(8)
star_dsm.scale_feature_set()
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Productivity dataset
gwp_features = gwp_dsm.get_scaled_feat_ds()
# Target assumed to be the last column of the complete dataset — TODO confirm
# against DatasetManager in task3_1.
gwp_targets = gwp_dsm.get_complete_ds()[:, -1]
# Star dataset
star_features = star_dsm.get_scaled_feat_ds()
star_targets = star_dsm.get_complete_ds()[:, -1]
# GWP dataset
gwp_mm = ModelManager(gwp_features, gwp_targets)
# Star dataset
star_mm = ModelManager(star_features, star_targets)
Methodology
Evaluation metrics
Productivity dataset: accuracy, precision, recall, F1 score. These metrics are ideal metrics for evaluating classification models as they provide comprehensive insight into a model's performance. Accuracy helps understand the overall effectiveness of the model. However, it can be misleading in imbalanced datasets, which is where precision and recall come in. They provide a more nuanced view of the model's ability to correctly identify positive instances and avoid false positives. The F1 score harmonises precision and recall, offering a single metric that seeks a balance between these two characteristics, making it especially useful when the costs of false positives and false negatives are significantly different.
Star dataset: mean squared error (MSE), mean absolute error (MAE), R2 score. These are robust metrics for evaluating regression models, with each illuminating different aspects of model performance. MSE emphasizes larger errors by squaring residuals, making it useful when larger errors are undesirable. MAE provides a more straightforward measure of average error magnitude, regardless of direction. The R2 score complements these by providing a relative measure of how much variance the model can explain, giving a broader picture of model performance beyond just raw error. These combined provide a comprehensive assessment of the model's effectiveness.
Notes
# Splitting productivity dataset (80/20 experiment)
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)
# Splitting star dataset
# NOTE(review): fractions are of the whole star dataset, so only 2% of it is
# used in total (1.6% train / 0.4% test) — presumably to keep SVM training
# tractable; confirm this is intentional.
star_mm.split_dataset(train_size=0.016, test_size=0.004)
# Productivity dataset
gwp_mm.train_model("reg", 10)
Regression model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Star dataset
star_mm.train_model("clf", 10)
Classifier model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Productivity dataset
gwp_mm.visualise_results_reg()
# Star dataset
# Encodings for column 17, which appears to be the star target — TODO confirm
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)
Productivity dataset
Star dataset
# Splitting productivity dataset (75/25 experiment)
gwp_mm.split_dataset(train_size=0.75, test_size=0.25)
# Splitting star dataset (same 75/25 ratio within the 2% subsample)
star_mm.split_dataset(train_size=0.015, test_size=0.005)
# Productivity dataset
gwp_mm.train_model("reg", 10)
Regression model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Star dataset
star_mm.train_model("clf", 10)
Classifier model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Productivity dataset
gwp_mm.visualise_results_reg()
# Star dataset
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)
Productivity dataset
Star dataset
# Splitting productivity dataset (70/30 experiment)
gwp_mm.split_dataset(train_size=0.7, test_size=0.3)
# Splitting star dataset (same 70/30 ratio within the 2% subsample)
star_mm.split_dataset(train_size=0.014, test_size=0.006)
# Productivity dataset
gwp_mm.train_model("reg", 10)
Regression model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Star dataset
star_mm.train_model("clf", 10)
Classifier model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Productivity dataset
gwp_mm.visualise_results_reg()
# Star dataset
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)
Productivity dataset
Star dataset
# Splitting productivity dataset (60/40 experiment)
gwp_mm.split_dataset(train_size=0.6, test_size=0.4)
# Splitting star dataset (same 60/40 ratio within the 2% subsample)
star_mm.split_dataset(train_size=0.012, test_size=0.008)
# Productivity dataset
gwp_mm.train_model("reg", 10)
Regression model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Star dataset
star_mm.train_model("clf", 10)
Classifier model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Productivity dataset
gwp_mm.visualise_results_reg()
# Star dataset
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)
Productivity dataset
Star dataset
# Splitting productivity dataset (50/50 experiment)
gwp_mm.split_dataset(train_size=0.5, test_size=0.5)
# Splitting star dataset (same 50/50 ratio within the 2% subsample)
star_mm.split_dataset(train_size=0.01, test_size=0.01)
# Productivity dataset
gwp_mm.train_model("reg", 10)
Regression model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Star dataset
star_mm.train_model("clf", 10)
Classifier model initialised... Fitting model and performing cross-validation... Model fitting and cross-validation complete... Making predictions...
# Productivity dataset
gwp_mm.visualise_results_reg()
# Star dataset
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)
Productivity dataset
Star dataset
Productivity dataset
Star dataset
The targets (i.e. the values we're trying to predict) for the star dataset all belong to one of a discrete set of classes. Linear regression isn't ideal for predicting discrete classes; rather it is designed for continuous outputs. It calculates the best-fitting line through a set of data and predicts values (which can be any number in the set of real numbers) along that line. When working with discrete classes, these predictions often don't make sense. E.g., if we're predicting classes labeled 0, 1, and 2, linear regression might give us 1.2785, which doesn't correspond to any class. Furthermore, linear regression is sensitive to outliers, which can significantly skew the best-fitting line and hence, the predictions. In classification problems, this can lead to many misclassifications.